In [1]:
# Import libraries
import pandas
import numpy as np
import json
from pprint import pprint
import re
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *
init_notebook_mode(connected=True)

import cufflinks as cf
print("cufflinks version: " + cf.__version__)
cf.set_config_file(offline=True, world_readable=True, theme='ggplot')
cufflinks version: 0.12.0
In [2]:
# Read dataset
df = pandas.read_json('../data/greek_publications_data_v3.json')

# Drop_data_count is a dict that stores the total rows number after
# processing steps that drop data. This is for later reference,
# to ensure that the amount of dropped data does not affect
# the statistics.
total_rows = df['item_id'].count()
print ("Total rows: "+str(total_rows))
print("Values and percentage missing from each column:")
for key in df.keys():
    missing = total_rows-df[key].count()
    percentage = " (%1.2f%%)" % (float(missing)/float(total_rows) * 100.0)
    print(str(missing)+str(percentage)+"\t"+key)
Total rows: 252533
Values and percentage missing from each column:
470 (0.19%)	affiliation
0 (0.00%)	author
473 (0.19%)	country
0 (0.00%)	item_id
0 (0.00%)	year
In [3]:
# Print a sample of the dataset
df.head()
Out[3]:
affiliation author country item_id year
0 University of Delaware, Newark, United States Nikolakis Vladimiros United States 2589 2012
1 University of Delaware, Newark, United States Vlachos Dionisios G. United States 2589 2012
2 IRCCS Fondazione Salvatore Maugeri, Pavia, Italy Moscato Gianna Italy 2595 2011
3 Centro de Regulacion Genomica, Barcelona, Spain Nikolaou Christoforos Spain 2602 2009
4 Goethe Univ Frankfurt, Dept Neurosurg, Kliniku... Archavlis Eleftherios Germany 2603 2013
In [4]:
# Remove entries with invalid years
total_rows_before = df['item_id'].count()
df = df.dropna(subset=['year'])
df=df.loc[df['year'] > 0]
total_rows_after = df['item_id'].count()
dropped = total_rows_before - total_rows_after
print("Dropped "+str(dropped)+" values with invalid year")

# Remove entries with empty or null countries
total_rows_before = df['item_id'].count()
df = df.dropna(subset=['country'])
dropped = total_rows_before - total_rows_after
print("Dropped "+str(dropped)+" values with invalid country")
Dropped 0 values with invalid year
Dropped 0 values with invalid country
In [5]:
# Total unique country names in dataset
df.groupby('country').size().count()
Out[5]:
1141
In [6]:
# Create a dictionary that maps country names
# to ISO 3166-1-alpha-2 country codes
with open('../data/countries.geojson') as data_file:    
  geojson = json.load(data_file)
countries = {}
for x in geojson['features']:
  countries.update({x['properties']['admin']: x['properties']['iso_a2']})

# Pring mapping of countries. There are 247 countries in the world.
# Sovereign states are not included in this list. 
countries
Out[6]:
{'Afghanistan': 'AF',
 'Albania': 'AL',
 'Algeria': 'DZ',
 'Angola': 'AO',
 'Antarctica': 'AQ',
 'Argentina': 'AR',
 'Armenia': 'AM',
 'Australia': 'AU',
 'Austria': 'AT',
 'Azerbaijan': 'AZ',
 'Bangladesh': 'BD',
 'Belarus': 'BY',
 'Belgium': 'BE',
 'Belize': 'BZ',
 'Benin': 'BJ',
 'Bhutan': 'BT',
 'Bolivia': 'BO',
 'Bosnia and Herzegovina': 'BA',
 'Botswana': 'BW',
 'Brazil': 'BR',
 'Brunei': 'BN',
 'Bulgaria': 'BG',
 'Burkina Faso': 'BF',
 'Burundi': 'BI',
 'Cambodia': 'KH',
 'Cameroon': 'CM',
 'Canada': 'CA',
 'Central African Republic': 'CF',
 'Chad': 'TD',
 'Chile': 'CL',
 'China': 'CN',
 'Colombia': 'CO',
 'Costa Rica': 'CR',
 'Croatia': 'HR',
 'Cuba': 'CU',
 'Cyprus': 'CY',
 'Czech Republic': 'CZ',
 'Democratic Republic of the Congo': 'CD',
 'Denmark': 'DK',
 'Djibouti': 'DJ',
 'Dominican Republic': 'DO',
 'East Timor': 'TL',
 'Ecuador': 'EC',
 'Egypt': 'EG',
 'El Salvador': 'SV',
 'Equatorial Guinea': 'GQ',
 'Eritrea': 'ER',
 'Estonia': 'EE',
 'Ethiopia': 'ET',
 'Falkland Islands': 'FK',
 'Fiji': 'FJ',
 'Finland': 'FI',
 'France': 'FR',
 'French Southern and Antarctic Lands': 'TF',
 'Gabon': 'GA',
 'Gambia': 'GM',
 'Georgia': 'GE',
 'Germany': 'DE',
 'Ghana': 'GH',
 'Greece': 'GR',
 'Greenland': 'GL',
 'Guatemala': 'GT',
 'Guinea': 'GN',
 'Guinea Bissau': 'GW',
 'Guyana': 'GY',
 'Haiti': 'HT',
 'Honduras': 'HN',
 'Hungary': 'HU',
 'Iceland': 'IS',
 'India': 'IN',
 'Indonesia': 'ID',
 'Iran': 'IR',
 'Iraq': 'IQ',
 'Ireland': 'IE',
 'Israel': 'IL',
 'Italy': 'IT',
 'Ivory Coast': 'CI',
 'Jamaica': 'JM',
 'Japan': 'JP',
 'Jordan': 'JO',
 'Kazakhstan': 'KZ',
 'Kenya': 'KE',
 'Kosovo': '-99',
 'Kuwait': 'KW',
 'Kyrgyzstan': 'KG',
 'Laos': 'LA',
 'Latvia': 'LV',
 'Lebanon': 'LB',
 'Lesotho': 'LS',
 'Liberia': 'LR',
 'Libya': 'LY',
 'Lithuania': 'LT',
 'Luxembourg': 'LU',
 'Macedonia': 'MK',
 'Madagascar': 'MG',
 'Malawi': 'MW',
 'Malaysia': 'MY',
 'Mali': 'ML',
 'Mauritania': 'MR',
 'Mexico': 'MX',
 'Moldova': 'MD',
 'Mongolia': 'MN',
 'Montenegro': 'ME',
 'Morocco': 'MA',
 'Mozambique': 'MZ',
 'Myanmar': 'MM',
 'Namibia': 'NA',
 'Nepal': 'NP',
 'Netherlands': 'NL',
 'New Caledonia': 'NC',
 'New Zealand': 'NZ',
 'Nicaragua': 'NI',
 'Niger': 'NE',
 'Nigeria': 'NG',
 'North Korea': 'KP',
 'Northern Cyprus': '-99',
 'Norway': 'NO',
 'Oman': 'OM',
 'Pakistan': 'PK',
 'Palestine': 'PS',
 'Panama': 'PA',
 'Papua New Guinea': 'PG',
 'Paraguay': 'PY',
 'Peru': 'PE',
 'Philippines': 'PH',
 'Poland': 'PL',
 'Portugal': 'PT',
 'Puerto Rico': 'PR',
 'Qatar': 'QA',
 'Republic of Congo': 'CG',
 'Republic of Serbia': 'RS',
 'Romania': 'RO',
 'Russia': 'RU',
 'Rwanda': 'RW',
 'Saudi Arabia': 'SA',
 'Senegal': 'SN',
 'Sierra Leone': 'SL',
 'Slovakia': 'SK',
 'Slovenia': 'SI',
 'Solomon Islands': 'SB',
 'Somalia': 'SO',
 'Somaliland': '-99',
 'South Africa': 'ZA',
 'South Korea': 'KR',
 'South Sudan': 'SS',
 'Spain': 'ES',
 'Sri Lanka': 'LK',
 'Sudan': 'SD',
 'Suriname': 'SR',
 'Swaziland': 'SZ',
 'Sweden': 'SE',
 'Switzerland': 'CH',
 'Syria': 'SY',
 'Taiwan': 'TW',
 'Tajikistan': 'TJ',
 'Thailand': 'TH',
 'The Bahamas': 'BS',
 'Togo': 'TG',
 'Trinidad and Tobago': 'TT',
 'Tunisia': 'TN',
 'Turkey': 'TR',
 'Turkmenistan': 'TM',
 'Uganda': 'UG',
 'Ukraine': 'UA',
 'United Arab Emirates': 'AE',
 'United Kingdom': 'GB',
 'United Republic of Tanzania': 'TZ',
 'United States of America': 'US',
 'Uruguay': 'UY',
 'Uzbekistan': 'UZ',
 'Vanuatu': 'VU',
 'Venezuela': 'VE',
 'Vietnam': 'VN',
 'Western Sahara': 'EH',
 'Yemen': 'YE',
 'Zambia': 'ZM',
 'Zimbabwe': 'ZW'}
In [7]:
# Add manualy some more country name to code mappings
# specific to the given dataset.
countries.update({
 'USA': 'US',
 'brasil': 'BR',
 'byelarus': 'BY',
 'england': 'UK',
 'españa': 'ES',
 'french guiana': 'FR',
 'french polynesia': 'FR',
 'gr': 'GR',
 'méxico': 'MX',
 'scotland': 'GB',
 'serbia': 'RS',
 'u arab emirates': 'AE',
 'united states': 'US',
 'viet nam': 'VN',
 'wales': 'GB'
})

# Create an unknown countries set that will store country names
# from the dataset that could not be mapped to an appropriate
# country code or sovereign states that are not included in the
# countries list.
unknown_countries = set()

# Transform country to iso code function.
# It searches the country string for a substring that maches a country name
# and it returns the iso code or 'UNKNOWN'
def transform_country(country):
  country = str(country).translate('.,')
  c = country.lower()
  for name,code in countries.items():
    if c.find(name.lower()) >= 0:
      return code
  unknown_countries.add(c)
  return 'UNKNOWN'
In [8]:
# Appy the transformation from country names to iso codes to the dataset.
df['country'] = df['country'].apply(lambda country: transform_country(country))
df.head()
Out[8]:
affiliation author country item_id year
0 University of Delaware, Newark, United States Nikolakis Vladimiros US 2589 2012
1 University of Delaware, Newark, United States Vlachos Dionisios G. US 2589 2012
2 IRCCS Fondazione Salvatore Maugeri, Pavia, Italy Moscato Gianna IT 2595 2011
3 Centro de Regulacion Genomica, Barcelona, Spain Nikolaou Christoforos ES 2602 2009
4 Goethe Univ Frankfurt, Dept Neurosurg, Kliniku... Archavlis Eleftherios DE 2603 2013
In [9]:
# Drop out the unknown counries since the rows with unknown countries are negligible
total_rows_before = df['item_id'].count()
df=df.loc[df['country'] != 'UNKNOWN']
total_rows_after = df['item_id'].count()
dropped = total_rows_before - total_rows_after
percentage = "%.4f%%" % (float(dropped) / float(total_rows_before))
print("Dropped " + str(dropped) + " (" + percentage + ") values with unknown countries")
Dropped 455 (0.0018%) values with unknown countries
In [10]:
df.groupby(['year'])['year'].count().sort_values().iplot(kind='bar',xTitle='Years', title='Authors per year')
In [11]:
df1 = df[['country']]
df1.groupby(['country'])['country'].count().sort_values().iplot(kind='barh',yTitle='Country', title='Multiple counts per paper',dimensions=(1000,2500))
In [12]:
df2 = df.drop_duplicates(subset=['country','item_id'])[['country']]
df2.groupby(['country'])['country'].count().sort_values().iplot(kind='barh',yTitle='Country', title='Single counts per paper',dimensions=(1000,2500))
In [13]:
df3 = df.drop_duplicates(subset=['country','author'])[['country']]
df3.groupby(['country'])['country'].count().sort_values().iplot(kind='barh',yTitle='Country', title='Single counts per author',dimensions=(1000,2500))
In [14]:
plot1_df=pandas.DataFrame({'count' : df1.groupby(by=["country"],sort=True).size()}).sort_values(by='count',ascending=True).reset_index()
plot2_df=pandas.DataFrame({'count' : df2.groupby(by=["country"],sort=True).size()}).sort_values(by='count',ascending=True).reset_index()
plot3_df=pandas.DataFrame({'count' : df3.groupby(by=["country"],sort=True).size()}).sort_values(by='count',ascending=True).reset_index()

iplot({
  'data': [
    Bar(**{
      'y': plot1_df['country'],
      'x': plot1_df['count'],
      'name': 'multiple counts',
      'orientation': 'h'
    }),
    Bar(**{
      'y': plot2_df['country'],
      'x': plot2_df['count'],
      'name': 'count paper once',
      'orientation': 'h'
    }),
    Bar(**{
      'y': plot3_df['country'],
      'x': plot3_df['count'],
      'name': 'count author once',
      'orientation': 'h'
    })
  ],
    'layout': Layout(**{
      'title': 'Compare multiple counts vs single paper or single author',
      'barmode': 'stack',
      'height': 3500,
      'width': 950
    })
})
In [15]:
# Drop out results from Greece
total_rows_before = df['item_id'].count()
df=df.loc[df['country'] != 'GR']
total_rows_after = df['item_id'].count()
dropped = total_rows_before - total_rows_after
percentage = "%.4f%%" % (float(dropped) / float(total_rows_before))

print("Dropped " + str(dropped) + " (" + percentage + ") values with country Greece")
Dropped 239 (0.0009%) values with country Greece
In [16]:
# Write the final csv
csv_df = df.drop_duplicates(subset=['country','author','year']).groupby(["country","year"]).size().to_frame(name = 'count').sort_values(by='count',ascending=False).reset_index()
print('total csv lines: ' + str(csv_df['country'].count()))
csv_df.to_csv(path_or_buf='../public/data/greek_publications.csv',index=False)
total csv lines: 723
In [17]:
# Append the Greek data to csv file
greek_data = [
  "GR,2008,21297",
  "GR,2009,22057",
  "GR,2010,21756",
  "GR,2011,22127",
  "GR,2012,22019",
  "GR,2013,22399",
  "GR,2014,22660",
  "GR,2015,21651"
]
print(greek_data)
with open("../public/data/greek_publications.csv", "a") as file:
  file.write("\n".join(greek_data))
['GR,2008,21297', 'GR,2009,22057', 'GR,2010,21756', 'GR,2011,22127', 'GR,2012,22019', 'GR,2013,22399', 'GR,2014,22660', 'GR,2015,21651']
In [ ]: